import json
import datetime
import geopandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
# Boundary layers: school districts and 2010 census tracts.
school = geopandas.read_file("../data/geo/nyc_school_districts.geojson")
census = geopandas.read_file("../data/geo/nyc_2010_tracts_formatted.geojson")

# Geocoded eviction records, with every column name lower-cased.
nyc = pd.read_csv("../data/raw/nyc_evictions_geocoded.csv").rename(columns=str.lower)

# `lon_lat` holds "lon,lat" text; split it on the comma into two new columns.
lon_lat_parts = nyc['lon_lat'].str.split(',', expand=True)
nyc[['lon', 'lat']] = lon_lat_parts
nyc[['eviction_address', 'lon_lat', 'lon', 'lat']].head()
| | eviction_address | lon_lat | lon | lat |
|---|---|---|---|---|
| 0 | 454 EAST 105TH ST | -73.93865,40.788143 | -73.93865 | 40.788143 |
| 1 | 601 WEST 189TH ST | -73.930176,40.85436 | -73.930176 | 40.85436 |
| 2 | 2607 AVENUE O | -73.94823,40.61415 | -73.94823 | 40.61415 |
| 3 | 726 WILLOUGHBY AVE | -73.93993,40.69493 | -73.93993 | 40.69493 |
| 4 | 945 SARATOGA AVENUE | -73.91456,40.65798 | -73.91456 | 40.65798 |
# Parse the execution date once, then derive the year and monthly-period columns from it.
executed = pd.to_datetime(nyc['executed_date'])
nyc['executed_date'] = executed
nyc['executed_year'] = executed.dt.year
nyc['executed_month'] = executed.dt.to_period('M')
# nyc['executed_week'] = nyc['executed_date'].dt.isocalendar().week
nyc['executed_date'].dt
<pandas.core.indexes.accessors.DatetimeProperties object at 0x122187d90>
# Show the rows whose execution year is 2070.
nyc.loc[nyc['executed_year'] == 2070]
| | court_index_number | docket_number | eviction_address | eviction_apt_num | executed_date | marshal_first_name | marshal_last_name | residential_commercial_ind | borough | eviction_zip | ... | tiger_line_id | side | state_code | county_code | tract_code | block_code | lon | lat | executed_year | executed_month |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 54351 | 28748/16 | 317112 | 2714 BAINBRIDGE AVENUE | 4A | 2070-03-29 | John | Villanueva | Residential | BRONX | 10458 | ... | 80298904.0 | R | 36.0 | 5.0 | 40502.0 | 2000.0 | -73.89097 | 40.86593 | 2070 | 2070-03 |
1 rows × 27 columns
# Drop the rows dated in 2070 and renumber the index.
nyc = nyc[nyc['executed_year'] != 2070].reset_index(drop=True)

# --- nyc to GeoDataFrame ---
# Build point geometries from the lon/lat columns, reusing the census layer's CRS.
nyc_gdf = geopandas.GeoDataFrame(
    nyc,
    geometry=geopandas.points_from_xy(nyc.lon, nyc.lat, crs=census.crs),
)

# Left spatial join: every eviction row is kept and, where its point lies
# within a school-district polygon, that district's attributes are attached.
# NOTE(review): newer geopandas releases renamed `op` to `predicate`;
# switch the keyword when upgrading geopandas.
nyc_school = geopandas.sjoin(nyc_gdf, school, op='within', how='left')

# Total number of evictions per school district.
nyc_school_total = (
    nyc_school
    .groupby(['school_dist'])
    .size()
    .reset_index(name='num_evictions')
)
# Load the school-district GeoJSON as a plain dict for plotly.
with open('../data/geo/nyc_school_districts.geojson') as json_file:
    sdGeoJson = json.load(json_file)

# Choropleth of the per-district totals; rows are matched to GeoJSON features
# through `properties.school_dist`.
fig = px.choropleth_mapbox(
    nyc_school_total,
    geojson=sdGeoJson,
    locations="school_dist",
    featureidkey="properties.school_dist",
    color="num_evictions",
    color_discrete_sequence=px.colors.qualitative.Dark24,
    opacity=0.5,
)

# NOTE(review): the Mapbox access token is hard-coded below; consider loading
# it from configuration instead of committing it with the notebook.
total_map_layout = dict(
    height=800,
    title="Accumulative Number of Evictions by School District since 2017",
    mapbox_style="streets",
    mapbox_accesstoken='pk.eyJ1IjoiZGdpbGxlbiIsImEiOiJjam85OGFvaXIxZXRlM2tubG8zY3E0OHh1In0.KkjAoFhjOOFjXAEuZ1IRog',
    mapbox_zoom=9,
    mapbox_center={"lat": 40.75, "lon": -73.9},
)
fig.update_layout(**total_map_layout)
fig.show()
# Number of evictions per school district and execution year.
nyc_school_year = (
    nyc_school
    .groupby(['school_dist', 'executed_year'])
    .size()
    .reset_index(name='num_evictions')
)
# fig = px.choropleth_mapbox(
# nyc_school_year, geojson=sdGeoJson, color="num_evictions",
# color_discrete_sequence = px.colors.qualitative.Dark24, range_color=[0, 2000],
# locations="school_dist", featureidkey="properties.school_dist", opacity=0.5,
# animation_frame='executed_year')
# fig.update_layout(
# height=800,
# title="Yearly Number of Evictions by School District",
# mapbox_style="streets",
# mapbox_accesstoken='pk.eyJ1IjoiZGdpbGxlbiIsImEiOiJjam85OGFvaXIxZXRlM2tubG8zY3E0OHh1In0.KkjAoFhjOOFjXAEuZ1IRog',
# mapbox_zoom=9,
# mapbox_center = {"lat": 40.75, "lon": -73.9}
# )
# fig.show()
# Number of evictions per school district and execution month.
nyc_school_month = (
    nyc_school
    .groupby(['school_dist', 'executed_month'])
    .size()
    .reset_index(name='num_evictions')
)
# Store the monthly period as plain text (e.g. "2017-01").
nyc_school_month['executed_month'] = nyc_school_month['executed_month'].astype(str)
# fig = px.choropleth_mapbox(
# nyc_school_month, geojson=sdGeoJson, color="num_evictions",
# color_discrete_sequence = px.colors.qualitative.Dark24, range_color=[0, 250],
# locations="school_dist", featureidkey="properties.school_dist", opacity=0.5,
# animation_frame='executed_month')
# fig.update_layout(
# height=800,
# title="Monthly Number of Evictions by School District",
# mapbox_style="streets",
# mapbox_accesstoken='pk.eyJ1IjoiZGdpbGxlbiIsImEiOiJjam85OGFvaXIxZXRlM2tubG8zY3E0OHh1In0.KkjAoFhjOOFjXAEuZ1IRog',
# mapbox_zoom=9,
# mapbox_center = {"lat": 40.75, "lon": -73.9}
# )
# fig.show()
from gluonts.dataset.common import ListDataset
from gluonts.dataset.util import to_pandas
from gluonts.model.simple_feedforward import SimpleFeedForwardEstimator
from gluonts.mx.trainer import Trainer
from gluonts.evaluation.backtest import make_evaluation_predictions
# Build the time-series table: one row per month, one column per school district.
nyc_school_month['school_dist'] = nyc_school_month['school_dist'].astype(int)
table = (
    nyc_school_month
    .pivot(index='school_dist', columns='executed_month', values='num_evictions')
    .iloc[:, :-2]  # leave out the last two month columns
    .T             # months become the row index
)
table.head()
| school_dist | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| executed_month | |||||||||||||||||||||
| 2017-01 | 22.0 | 77.0 | 26.0 | 22.0 | 48.0 | 60.0 | 46.0 | 80.0 | 165.0 | 204.0 | ... | 44.0 | 65.0 | 30.0 | 16.0 | 59.0 | 44.0 | 55.0 | 45.0 | 57.0 | 28.0 |
| 2017-02 | 11.0 | 53.0 | 13.0 | 16.0 | 40.0 | 42.0 | 37.0 | 49.0 | 102.0 | 153.0 | ... | 34.0 | 47.0 | 24.0 | 6.0 | 54.0 | 37.0 | 34.0 | 30.0 | 59.0 | 18.0 |
| 2017-03 | 9.0 | 59.0 | 24.0 | 18.0 | 48.0 | 53.0 | 42.0 | 74.0 | 108.0 | 186.0 | ... | 48.0 | 46.0 | 28.0 | 3.0 | 75.0 | 72.0 | 41.0 | 44.0 | 59.0 | 16.0 |
| 2017-04 | 10.0 | 61.0 | 33.0 | 23.0 | 36.0 | 66.0 | 53.0 | 53.0 | 87.0 | 174.0 | ... | 12.0 | 51.0 | 30.0 | 13.0 | 37.0 | 42.0 | 42.0 | 37.0 | 56.0 | 19.0 |
| 2017-05 | 16.0 | 64.0 | 40.0 | 16.0 | 28.0 | 53.0 | 50.0 | 70.0 | 151.0 | 214.0 | ... | 37.0 | 57.0 | 34.0 | 8.0 | 66.0 | 53.0 | 45.0 | 33.0 | 63.0 | 25.0 |
5 rows × 32 columns
# Build the train and test sets: one monthly series per school district (ids 1..32).
train_records = []
test_records = []
for district in range(1, 33):
    monthly_counts = table[district].values
    # Training series: only the first 35 months.
    train_records.append({
        'start': pd.Timestamp('2017-01', freq='M'),
        'target': monthly_counts[:35],
        'item_id': district,
    })
    # Test series: every month present in the table.
    test_records.append({
        'start': pd.Timestamp('2017-01', freq='M'),
        'target': monthly_counts,
        'item_id': district,
    })

train_set = ListDataset(train_records, freq='M')
test_set = ListDataset(test_records, freq='M')

# Take the first entry of each dataset and convert it to a pandas series.
train_entry = next(iter(train_set))
test_entry = next(iter(test_set))
train_series = to_pandas(train_entry)
test_series = to_pandas(test_entry)
# Two stacked panels with shared axes: train series on top, test series below.
fig, ax = plt.subplots(2, 1, sharex=True, sharey=True, figsize=(15, 8))
train_ax, test_ax = ax
fig.suptitle(f"School District {train_entry['item_id']}", fontsize=16)

train_series.plot(ax=train_ax)
train_ax.grid(which="both")
train_ax.legend(["train series"], loc="upper left")

test_series.plot(ax=test_ax)
# Red vertical line at the last timestamp of the train series.
test_ax.axvline(train_series.index[-1], color='r')
test_ax.grid(which="both")
test_ax.legend(["test series", "end of train series"], loc="upper left")

plt.show()
# Training configuration: CPU context, 10 epochs of 100 batches each.
cpu_trainer = Trainer(
    ctx="cpu",
    epochs=10,
    learning_rate=1e-3,
    hybridize=False,
    num_batches_per_epoch=100,
)

# Feed-forward estimator with one hidden layer of 10 units, predicting
# 4 monthly steps.
estimator = SimpleFeedForwardEstimator(
    num_hidden_dimensions=[10],
    prediction_length=4,
    freq='M',
    trainer=cpu_trainer,
)

predictor = estimator.train(train_set)
0%| | 0/100 [00:00<?, ?it/s]
learning rate from ``lr_scheduler`` has been overwritten by ``learning_rate`` in optimizer.
100%|██████████| 100/100 [00:01<00:00, 91.43it/s, epoch=1/10, avg_epoch_loss=4.57] 100%|██████████| 100/100 [00:01<00:00, 93.45it/s, epoch=2/10, avg_epoch_loss=4.06] 100%|██████████| 100/100 [00:01<00:00, 92.77it/s, epoch=3/10, avg_epoch_loss=3.99] 100%|██████████| 100/100 [00:01<00:00, 90.14it/s, epoch=4/10, avg_epoch_loss=3.96] 100%|██████████| 100/100 [00:01<00:00, 91.39it/s, epoch=5/10, avg_epoch_loss=3.94] 100%|██████████| 100/100 [00:01<00:00, 90.62it/s, epoch=6/10, avg_epoch_loss=3.93] 100%|██████████| 100/100 [00:01<00:00, 93.78it/s, epoch=7/10, avg_epoch_loss=3.92] 100%|██████████| 100/100 [00:01<00:00, 95.63it/s, epoch=8/10, avg_epoch_loss=3.94] 100%|██████████| 100/100 [00:01<00:00, 95.07it/s, epoch=9/10, avg_epoch_loss=3.96] 100%|██████████| 100/100 [00:01<00:00, 96.48it/s, epoch=10/10, avg_epoch_loss=3.93]
# Produce forecasts (100 sample paths each) and the matching series for the test set.
forecast_it, ts_it = make_evaluation_predictions(
    dataset=test_set,
    predictor=predictor,
    num_samples=100,
)

# Materialise both iterators, forecasts first.
forecasts = list(forecast_it)
tss = list(ts_it)

# Keep the first series and its forecast for closer inspection.
ts_entry, forecast_entry = tss[0], forecasts[0]

print(f"Number of sample paths: {forecast_entry.num_samples}")
print(f"Dimension of samples: {forecast_entry.samples.shape}")
print(f"Start date of the forecast window: {forecast_entry.start_date}")
print(f"Frequency of the time series: {forecast_entry.freq}")
Number of sample paths: 100 Dimension of samples: (100, 4) Start date of the forecast window: 2019-12-31 00:00:00 Frequency of the time series: M
# Summarise the forecast window by its mean and its 0.5-quantile.
future_mean = forecast_entry.mean
future_median = forecast_entry.quantile(0.5)
print(f"Mean of the future window:\n {future_mean}")
print(f"0.5-quantile (median) of the future window:\n {future_median}")
Mean of the future window: [5.6933813 6.1442003 5.907758 6.2272615] 0.5-quantile (median) of the future window: [5.5447927 6.3230925 5.789105 6.126518 ]
def plot_prob_forecasts(ts_entry, forecast_entry, plot_length=150,
                        prediction_intervals=(50.0, 90.0)):
    """Plot an observed series together with its probabilistic forecast.

    Args:
        ts_entry: Observed series; its last ``plot_length`` rows are drawn
            with ``.plot(ax=...)``.
        forecast_entry: Forecast object whose ``plot`` method accepts
            ``prediction_intervals`` and ``color``.
        plot_length: Number of trailing observations to draw.
        prediction_intervals: Interval levels (in percent) passed to the
            forecast's ``plot`` method and used for the legend labels.
    """
    # Legend labels: the interval entries are listed in reverse order of
    # ``prediction_intervals``.
    legend = ["observations", "median prediction"] + [
        f"{k}% prediction interval" for k in prediction_intervals
    ][::-1]

    _, ax = plt.subplots(1, 1, figsize=(10, 7))
    ts_entry[-plot_length:].plot(ax=ax)  # plot the time series
    # NOTE(review): no axes is passed here; presumably the forecast is drawn
    # on the current pyplot axes (the one created above) - confirm.
    forecast_entry.plot(prediction_intervals=prediction_intervals, color='g')
    plt.grid(which="both")
    plt.legend(legend, loc="upper left")
    plt.show()


plot_prob_forecasts(ts_entry, forecast_entry)